DATA PREPROCESSING
LOADING DATASET & CHECKING str():
library(readr)
credit_card <- read.csv("credit_card.csv")
print(str(credit_card))
## 'data.frame': 10127 obs. of 23 variables:
## $ CLIENTNUM : int 768805383 818770008 713982108 769911858 709106358 713061558 810347208 818906208 710930508 719661558 ...
## $ Attrition_Flag : chr "Existing Customer" "Existing Customer" "Existing Customer" "Existing Customer" ...
## $ Customer_Age : int 45 49 51 40 40 44 51 32 37 48 ...
## $ Gender : chr "M" "F" "M" "F" ...
## $ Dependent_count : int 3 5 3 4 3 2 4 0 3 2 ...
## $ Education_Level : chr "High School" "Graduate" "Graduate" "High School" ...
## $ Marital_Status : chr "Married" "Single" "Married" "Unknown" ...
## $ Income_Category : chr "$60K - $80K" "Less than $40K" "$80K - $120K" "Less than $40K" ...
## $ Card_Category : chr "Blue" "Blue" "Blue" "Blue" ...
## $ Months_on_book : int 39 44 36 34 21 36 46 27 36 36 ...
## $ Total_Relationship_Count : int 5 6 4 3 5 3 6 2 5 6 ...
## $ Months_Inactive_12_mon : int 1 1 1 4 1 1 1 2 2 3 ...
## $ Contacts_Count_12_mon : int 3 2 0 1 0 2 3 2 0 3 ...
## $ Credit_Limit : num 12691 8256 3418 3313 4716 ...
## $ Total_Revolving_Bal : int 777 864 0 2517 0 1247 2264 1396 2517 1677 ...
## $ Avg_Open_To_Buy : num 11914 7392 3418 796 4716 ...
## $ Total_Amt_Chng_Q4_Q1 : num 1.33 1.54 2.59 1.41 2.17 ...
## $ Total_Trans_Amt : int 1144 1291 1887 1171 816 1088 1330 1538 1350 1441 ...
## $ Total_Trans_Ct : int 42 33 20 20 28 24 31 36 24 32 ...
## $ Total_Ct_Chng_Q4_Q1 : num 1.62 3.71 2.33 2.33 2.5 ...
## $ Avg_Utilization_Ratio : num 0.061 0.105 0 0.76 0 0.311 0.066 0.048 0.113 0.144 ...
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1: num 9.34e-05 5.69e-05 2.11e-05 1.34e-04 2.17e-05 ...
## $ Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2: num 1 1 1 1 1 ...
## NULL
DIMENSIONS OF DATA:
dim(credit_card)
## [1] 10127 23
SUMMARY OF DATASET:
summary(credit_card)
## CLIENTNUM Attrition_Flag Customer_Age Gender
## Min. :7.08e+08 Length:10127 Min. :26.0 Length:10127
## 1st Qu.:7.13e+08 Class :character 1st Qu.:41.0 Class :character
## Median :7.18e+08 Mode :character Median :46.0 Mode :character
## Mean :7.39e+08 Mean :46.3
## 3rd Qu.:7.73e+08 3rd Qu.:52.0
## Max. :8.28e+08 Max. :73.0
## Dependent_count Education_Level Marital_Status Income_Category
## Min. :0.00 Length:10127 Length:10127 Length:10127
## 1st Qu.:1.00 Class :character Class :character Class :character
## Median :2.00 Mode :character Mode :character Mode :character
## Mean :2.35
## 3rd Qu.:3.00
## Max. :5.00
## Card_Category Months_on_book Total_Relationship_Count
## Length:10127 Min. :13.0 Min. :1.00
## Class :character 1st Qu.:31.0 1st Qu.:3.00
## Mode :character Median :36.0 Median :4.00
## Mean :35.9 Mean :3.81
## 3rd Qu.:40.0 3rd Qu.:5.00
## Max. :56.0 Max. :6.00
## Months_Inactive_12_mon Contacts_Count_12_mon Credit_Limit
## Min. :0.00 Min. :0.00 Min. : 1438
## 1st Qu.:2.00 1st Qu.:2.00 1st Qu.: 2555
## Median :2.00 Median :2.00 Median : 4549
## Mean :2.34 Mean :2.46 Mean : 8632
## 3rd Qu.:3.00 3rd Qu.:3.00 3rd Qu.:11068
## Max. :6.00 Max. :6.00 Max. :34516
## Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt
## Min. : 0 Min. : 3 Min. :0.00 Min. : 510
## 1st Qu.: 359 1st Qu.: 1324 1st Qu.:0.63 1st Qu.: 2156
## Median :1276 Median : 3474 Median :0.74 Median : 3899
## Mean :1163 Mean : 7469 Mean :0.76 Mean : 4404
## 3rd Qu.:1784 3rd Qu.: 9859 3rd Qu.:0.86 3rd Qu.: 4741
## Max. :2517 Max. :34516 Max. :3.40 Max. :18484
## Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
## Min. : 10.0 Min. :0.00 Min. :0.000
## 1st Qu.: 45.0 1st Qu.:0.58 1st Qu.:0.023
## Median : 67.0 Median :0.70 Median :0.176
## Mean : 64.9 Mean :0.71 Mean :0.275
## 3rd Qu.: 81.0 3rd Qu.:0.82 3rd Qu.:0.503
## Max. :139.0 Max. :3.71 Max. :0.999
## Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1
## Min. :0.00
## 1st Qu.:0.00
## Median :0.00
## Mean :0.16
## 3rd Qu.:0.00
## Max. :1.00
## Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
## Min. :0.00
## 1st Qu.:1.00
## Median :1.00
## Mean :0.84
## 3rd Qu.:1.00
## Max. :1.00
COLUMN NAMES IN THE DATASET:
column_names <- colnames(credit_card)
last_two_columns <- tail(column_names, 2)
print(column_names)
## [1] "CLIENTNUM"
## [2] "Attrition_Flag"
## [3] "Customer_Age"
## [4] "Gender"
## [5] "Dependent_count"
## [6] "Education_Level"
## [7] "Marital_Status"
## [8] "Income_Category"
## [9] "Card_Category"
## [10] "Months_on_book"
## [11] "Total_Relationship_Count"
## [12] "Months_Inactive_12_mon"
## [13] "Contacts_Count_12_mon"
## [14] "Credit_Limit"
## [15] "Total_Revolving_Bal"
## [16] "Avg_Open_To_Buy"
## [17] "Total_Amt_Chng_Q4_Q1"
## [18] "Total_Trans_Amt"
## [19] "Total_Trans_Ct"
## [20] "Total_Ct_Chng_Q4_Q1"
## [21] "Avg_Utilization_Ratio"
## [22] "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1"
## [23] "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"
DATA TYPES OF COLUMNS:
column_data_types <- sapply(credit_card, class)
print(column_data_types)
## CLIENTNUM
## "integer"
## Attrition_Flag
## "character"
## Customer_Age
## "integer"
## Gender
## "character"
## Dependent_count
## "integer"
## Education_Level
## "character"
## Marital_Status
## "character"
## Income_Category
## "character"
## Card_Category
## "character"
## Months_on_book
## "integer"
## Total_Relationship_Count
## "integer"
## Months_Inactive_12_mon
## "integer"
## Contacts_Count_12_mon
## "integer"
## Credit_Limit
## "numeric"
## Total_Revolving_Bal
## "integer"
## Avg_Open_To_Buy
## "numeric"
## Total_Amt_Chng_Q4_Q1
## "numeric"
## Total_Trans_Amt
## "integer"
## Total_Trans_Ct
## "integer"
## Total_Ct_Chng_Q4_Q1
## "numeric"
## Avg_Utilization_Ratio
## "numeric"
## Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1
## "numeric"
## Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
## "numeric"
AFTER DROPING UNNECESSARY COLUMNS:
credit_card_final <- subset(credit_card,select = !colnames(credit_card) %in% last_two_columns)
print(colnames(credit_card_final))
## [1] "CLIENTNUM" "Attrition_Flag"
## [3] "Customer_Age" "Gender"
## [5] "Dependent_count" "Education_Level"
## [7] "Marital_Status" "Income_Category"
## [9] "Card_Category" "Months_on_book"
## [11] "Total_Relationship_Count" "Months_Inactive_12_mon"
## [13] "Contacts_Count_12_mon" "Credit_Limit"
## [15] "Total_Revolving_Bal" "Avg_Open_To_Buy"
## [17] "Total_Amt_Chng_Q4_Q1" "Total_Trans_Amt"
## [19] "Total_Trans_Ct" "Total_Ct_Chng_Q4_Q1"
## [21] "Avg_Utilization_Ratio"
CHECKING FOR NULL VALUES
null_counts <- colSums(is.na(credit_card_final))
print(null_counts)
## CLIENTNUM Attrition_Flag Customer_Age
## 0 0 0
## Gender Dependent_count Education_Level
## 0 0 0
## Marital_Status Income_Category Card_Category
## 0 0 0
## Months_on_book Total_Relationship_Count Months_Inactive_12_mon
## 0 0 0
## Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal
## 0 0 0
## Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt
## 0 0 0
## Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
## 0 0 0
REPLACING SPACES WITH NA AND CHECKING FOR NULL VALUES OR UNFILLED
VALUES
credit_card_final[credit_card_final == " "] <- NA
null_counts1 <- colSums(is.na(credit_card_final))
print(null_counts1)
## CLIENTNUM Attrition_Flag Customer_Age
## 0 0 0
## Gender Dependent_count Education_Level
## 0 0 0
## Marital_Status Income_Category Card_Category
## 0 0 0
## Months_on_book Total_Relationship_Count Months_Inactive_12_mon
## 0 0 0
## Contacts_Count_12_mon Credit_Limit Total_Revolving_Bal
## 0 0 0
## Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1 Total_Trans_Amt
## 0 0 0
## Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
## 0 0 0
CATEGORICAL VARIABLES:
library(dplyr)
# Identify categorical variables (regardless of data type)
categorical_variables <- credit_card_final %>%
select_if(function(col) is.character(col) || is.factor(col)) %>%
names()
print(categorical_variables)
## [1] "Attrition_Flag" "Gender" "Education_Level" "Marital_Status"
## [5] "Income_Category" "Card_Category"
NUMERICAL VARIABLES:
library(dplyr)
# Identify numerical variables (excluding character and factor)
numerical_variables <- credit_card_final %>%
select_if(function(col) is.numeric(col)) %>%
names()
# Separate continuous and discrete variables
continuous_variables <- credit_card_final %>%
select(numerical_variables) %>%
summarise_all(function(col) isTRUE(all.equal(col, round(col))))
discrete_variables <- setdiff(numerical_variables, continuous_variables)
# Combine continuous and discrete variables into a single output
variables_summary <- list(Continuous = continuous_variables, Discrete = discrete_variables)
# Print the variables summary
print(discrete_variables)
## [1] "CLIENTNUM" "Customer_Age"
## [3] "Dependent_count" "Months_on_book"
## [5] "Total_Relationship_Count" "Months_Inactive_12_mon"
## [7] "Contacts_Count_12_mon" "Credit_Limit"
## [9] "Total_Revolving_Bal" "Avg_Open_To_Buy"
## [11] "Total_Amt_Chng_Q4_Q1" "Total_Trans_Amt"
## [13] "Total_Trans_Ct" "Total_Ct_Chng_Q4_Q1"
## [15] "Avg_Utilization_Ratio"
print(continuous_variables)
## CLIENTNUM Customer_Age Dependent_count Months_on_book
## 1 TRUE TRUE TRUE TRUE
## Total_Relationship_Count Months_Inactive_12_mon Contacts_Count_12_mon
## 1 TRUE TRUE TRUE
## Credit_Limit Total_Revolving_Bal Avg_Open_To_Buy Total_Amt_Chng_Q4_Q1
## 1 FALSE TRUE FALSE FALSE
## Total_Trans_Amt Total_Trans_Ct Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
## 1 TRUE TRUE FALSE FALSE
EDA –> “Exploratory Data Analysis”
DATA DISTRIBUTION OF CATEGORICAL VARIABLES:
library(ggplot2)
categorical_vars_ggplot <- c("Attrition_Flag", "Gender", "Education_Level", "Marital_Status", "Income_Category", "Card_Category")
for (cat_var in categorical_vars_ggplot) {
plot_obj <- ggplot(credit_card_final, aes_string(x = cat_var, fill = cat_var)) +
geom_bar() +
geom_text(stat='count', aes_string(label='..count..', y='..count..'), vjust=-0.5) +
labs(title = paste("Distribution of", cat_var), x = cat_var, y = "Count") +
scale_fill_brewer(palette="Set3") +
theme_minimal() +
theme(legend.position="none")
print(plot_obj)
}






# Load necessary libraries
library(ggplot2)
# Create a bar graph
ggplot(credit_card_final, aes(x = Card_Category, y = Credit_Limit, fill = Card_Category)) +
geom_bar(stat = "identity") +
labs(title = "Credit Limit by Card Category",
x = "Card Category",
y = "Credit Limit") +
theme_minimal()

# Load necessary libraries
#library(ggplot2)
# Create a bar graph with facets
#ggplot(credit_card_final, aes(x = Gender, y = Avg_Utilization_Ratio, fill = Gender)) +
# geom_bar(stat = "identity") +
#labs(title = "Avg Utilization Ratio and Avg Open To Buy by Gender",
# x = "Gender",
# y = "Avg Utilization Ratio") +
#facet_wrap(~ Avg_Open_To_Buy, ncol = 3) +
#theme_minimal()
CUSTOMER SEGMENTATION:
# Load necessary libraries
library(dplyr)
library(ggplot2)
# Select relevant attributes for segmentation
selected_attributes <- credit_card_final %>%
select(Customer_Age, Income_Category, Card_Category)
# Define the segmentation criteria (you can adjust these criteria as needed)
young_high_income <- with(credit_card_final, Customer_Age < 35 & Income_Category >= 4)
middle_age_high_income <- with(credit_card_final, Customer_Age >= 35 & Customer_Age <= 60 & Income_Category >= 4)
senior_high_income <- with(credit_card_final, Customer_Age > 60 & Income_Category >= 4)
young_low_income <- with(credit_card_final, Customer_Age < 35 & Income_Category < 4)
middle_age_low_income <- with(credit_card_final, Customer_Age >= 35 & Customer_Age <= 60 & Income_Category < 4)
senior_low_income <- with(credit_card_final, Customer_Age > 60 & Income_Category < 4)
# Assign segments to the original dataset
credit_card_final$Segment <- ifelse(young_high_income, "Young High Income",
ifelse(middle_age_high_income, "Middle Age High Income",
ifelse(senior_high_income, "Senior High Income",
ifelse(young_low_income, "Young Low Income",
ifelse(middle_age_low_income, "Middle Age Low Income",
ifelse(senior_low_income, "Senior Low Income", NA))))))
# Visualize the segmentation
ggplot(credit_card_final, aes(x = Customer_Age, y = Income_Category, color = Segment)) +
geom_point() +
labs(title = "Customer Segmentation by Age and Income Category")

# Summary statistics for each segment
segment_summary <- credit_card_final %>%
group_by(Segment) %>%
summarise(
Average_Age = mean(Customer_Age),
Average_Income = mean(Income_Category)
)
print(segment_summary)
## # A tibble: 6 × 3
## Segment Average_Age Average_Income
## <chr> <dbl> <dbl>
## 1 Middle Age High Income 46.9 NA
## 2 Middle Age Low Income 46.8 NA
## 3 Senior High Income 63.4 NA
## 4 Senior Low Income 62.5 NA
## 5 Young High Income 30.6 NA
## 6 Young Low Income 31.5 NA
# Segment customers based on Card Category
credit_card_final$Card_Segment <- ifelse(credit_card_final$Card_Category == "Blue", "Blue Card",
ifelse(credit_card_final$Card_Category == "Silver", "Silver Card",
ifelse(credit_card_final$Card_Category == "Gold", "Gold Card",
ifelse(credit_card_final$Card_Category == "Platinum", "Platinum Card", NA))))
# Visualize the Card Category segmentation
ggplot(credit_card_final, aes(x = Customer_Age, y = Income_Category, color = Card_Segment)) +
geom_point() +
labs(title = "Customer Segmentation by Age and Income Category (Card Category)")

# Summary statistics for each Card Category segment
card_category_summary <- credit_card_final %>%
group_by(Card_Segment) %>%
summarise(
Average_Age = mean(Customer_Age),
Average_Income = mean(Income_Category)
)
print(card_category_summary)
## # A tibble: 4 × 3
## Card_Segment Average_Age Average_Income
## <chr> <dbl> <dbl>
## 1 Blue Card 46.4 NA
## 2 Gold Card 45.4 NA
## 3 Platinum Card 47.5 NA
## 4 Silver Card 45.7 NA
DATA DISTRIBUTION OF NUMERICAL VARIABLES:
library(ggplot2)
# Define a list of column names and corresponding bin widths
column_binwidths <- list(
"Customer_Age" = 3,
"Total_Trans_Amt" = 500,
"Total_Trans_Ct" = 7,
"Credit_Limit" = 800
)
# Loop through the columns and create histograms with respective bin widths and density curves
for (column in names(column_binwidths)) {
# Get the bin width for the current column
binwidth <- column_binwidths[[column]]
# Create a histogram with density curve
hist_plot <- ggplot(credit_card_final, aes(x = .data[[column]])) +
geom_histogram(binwidth = binwidth, fill = "pink", color = "black", aes(y = ..density..)) +
geom_density(alpha = 0.5, color = "green") +
labs(title = paste("Histogram with Density Curve of", column)) +
theme_minimal()
# Display the histogram with density curve
print(hist_plot)
}




CORRELATION ANALYSIS:
#install.packages("corrplot")
library(corrplot)
# Calculate the correlation matrix for numerical variables
correlation_matrix <- cor(credit_card_final[, numerical_variables])
# Create a correlation plot
corrplot(correlation_matrix, method = "color", type = "upper", tl.col = "black", tl.srt = 50)

# Display the correlation plot
# Calculate the correlation matrix
correlation_matrix <- cor(credit_card_final[, numerical_variables])
# Find highly correlated pairs
threshold <- 0.7 # Set your desired correlation threshold
# Create an empty matrix to store the results
related_pairs <- matrix(nrow = 0, ncol = 2)
# Loop through the correlation matrix to find related pairs
for (i in 1:(length(numerical_variables) - 1)) {
for (j in (i + 1):length(numerical_variables)) {
if (abs(correlation_matrix[i, j]) >= threshold) {
related_pairs <- rbind(related_pairs, c(numerical_variables[i], numerical_variables[j]))
}
}
}
# Display related variable pairs
print(related_pairs)
## [,1] [,2]
## [1,] "Customer_Age" "Months_on_book"
## [2,] "Credit_Limit" "Avg_Open_To_Buy"
## [3,] "Total_Trans_Amt" "Total_Trans_Ct"
CORRELATION COEFFICIENTS:
# Define the pairs of variables
variable_pairs <- list(
c("Customer_Age", "Months_on_book"),
c("Credit_Limit", "Avg_Open_To_Buy"),
c("Total_Trans_Amt", "Total_Trans_Ct")
)
# Function to calculate and display correlation
calculate_and_display_correlation <- function(pair) {
variable1 <- pair[1]
variable2 <- pair[2]
# Extract data for the pair
data_pair <- credit_card_final[, c(variable1, variable2)]
# Calculate correlation
correlation_coefficient <- cor(data_pair[[variable1]], data_pair[[variable2]])
# Print the correlation result
cat("Correlation between", variable1, "and", variable2, "is", correlation_coefficient, "\n")
}
# Loop through variable pairs and calculate/display correlations
for (pair in variable_pairs) {
calculate_and_display_correlation(pair)
}
## Correlation between Customer_Age and Months_on_book is 0.789
## Correlation between Credit_Limit and Avg_Open_To_Buy is 0.996
## Correlation between Total_Trans_Amt and Total_Trans_Ct is 0.807
PLOTTING THE PAIRS:
# Load necessary libraries
library(ggplot2)
# List of related variable pairs
related_pairs <- list(
list("Customer_Age", "Months_on_book"),
list("Credit_Limit", "Avg_Open_To_Buy"),
list("Total_Trans_Amt", "Total_Trans_Ct")
)
# Create a function to plot scatter plots with regression lines for a pair of related variables
plot_scatter_with_regression <- function(pair) {
# Extract variable names
var1 <- pair[[1]]
var2 <- pair[[2]]
# Create a scatter plot
scatter_plot <- ggplot(credit_card_final, aes(x = .data[[var1]], y = .data[[var2]])) +
geom_point(alpha = 0.6, size = 3, color = "orchid") +
geom_smooth(method = "lm", se = FALSE, color = "dodgerblue", size = 1) +
labs(
title = paste("Scatter Plot of", var1, "vs", var2),
x = var1,
y = var2
) +
theme_minimal()
# Set the size of the plot
options(repr.plot.width = 6, repr.plot.height = 4)
# Display the scatter plot with a regression line
print(scatter_plot)
}
# Loop through related variable pairs and create individual scatter plots
for (pair in related_pairs) {
plot_scatter_with_regression(pair)
}



# Load necessary libraries
library(ggplot2)
library(gridExtra)
# Define the pairs of variables
variable_pairs <- list(
c("Customer_Age", "Months_on_book"),
c("Credit_Limit", "Avg_Open_To_Buy"),
c("Total_Trans_Amt", "Total_Trans_Ct")
)
# Create a function to plot histograms and QQ-plots for a variable pair
plot_histogram_qqpair <- function(pair) {
variable1 <- pair[1]
variable2 <- pair[2]
# Extract data for the pair
data_pair <- credit_card_final[, c(variable1, variable2)]
# Create a histogram for variable 1
hist_var1 <- ggplot(data_pair, aes(x = .data[[variable1]])) +
geom_histogram(binwidth = 10, color = "black", fill = "darkgoldenrod") +
labs(title = paste("Histogram of", variable1)) +
theme_minimal()
# Create a histogram for variable 2
hist_var2 <- ggplot(data_pair, aes(x = .data[[variable2]])) +
geom_histogram(binwidth = 10, color = "black", fill = "tomato") +
labs(title = paste("Histogram of", variable2)) +
theme_minimal()
# Create a QQ-plot
qqplot <- ggplot(data_pair, aes(sample = .data[[variable1]])) +
geom_qq() +
geom_qq_line(color = "darkolivegreen") +
labs(title = paste("QQ-Plot of", variable1, "vs Normal Distribution")) +
theme_minimal()
# Arrange histograms and QQ-plot in one grid
grid.arrange(hist_var1, hist_var2, qqplot, ncol = 2)
}
# Loop through variable pairs and plot histograms and QQ-plots
for (pair in variable_pairs) {
plot_histogram_qqpair(pair)
}



OUTLIER DETECTION AND REMOVAL:
# Create a copy of the dataset to avoid modifying the original data
credit_card_no_outliers <- credit_card_final
# Outlier Identifiaction and removal from Customer Age
df_outliers <- ezids::outlierKD2(credit_card_no_outliers,Customer_Age, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 2
## Proportion (%) of outliers: 0
## Mean of the outliers: 71.5
## Mean without removing outliers: 46.3
## Mean if we remove outliers: 46.3
## Outliers successfully removed
credit_card_no_outliers[["Customer_Age"]][df_outliers$outliers] <- NA
# Outlier Identifiaction and removal from Credit Limit
df_outliers <- ezids::outlierKD2(credit_card_no_outliers,Credit_Limit, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 984
## Proportion (%) of outliers: 10.8
## Mean of the outliers: 31551
## Mean without removing outliers: 8632
## Mean if we remove outliers: 6165
## Outliers successfully removed
credit_card_no_outliers[["Credit_Limit"]][df_outliers$outliers] <- NA
# Outlier Identifiaction and removal from Months on Book
df_outliers <- ezids::outlierKD2(credit_card_no_outliers,Months_on_book, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 386
## Proportion (%) of outliers: 4
## Mean of the outliers: 35.5
## Mean without removing outliers: 35.9
## Mean if we remove outliers: 35.9
## Outliers successfully removed
credit_card_no_outliers[["Months_on_book"]][df_outliers$outliers] <- NA
# Outlier Identifiaction and removal from Total Revolving Balance
df_outliers <- ezids::outlierKD2(credit_card_no_outliers,Total_Revolving_Bal, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 0
## Proportion (%) of outliers: 0
## Mean of the outliers: NaN
## Mean without removing outliers: 1163
## Mean if we remove outliers: 1163
## Outliers successfully removed
credit_card_no_outliers[["Total_Revolving_Bal"]][df_outliers$outliers] <- NA
# Outlier Identifiaction and removal from Total Transaction Amount
df_outliers <- ezids::outlierKD2(credit_card_no_outliers,Total_Trans_Amt, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 896
## Proportion (%) of outliers: 9.7
## Mean of the outliers: 13770
## Mean without removing outliers: 4404
## Mean if we remove outliers: 3495
## Outliers successfully removed
credit_card_no_outliers[["Total_Trans_Amt"]][df_outliers$outliers] <- NA
# Outlier Identifiaction and removal from Total Transaction Ct
df_outliers <- ezids::outlierKD2(credit_card_no_outliers,Total_Trans_Ct, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 2
## Proportion (%) of outliers: 0
## Mean of the outliers: 138
## Mean without removing outliers: 64.9
## Mean if we remove outliers: 64.8
## Outliers successfully removed
credit_card_no_outliers[["Total_Trans_Ct"]][df_outliers$outliers] <- NA
# Outlier Identifiaction and removal from Average Open to Buy
df_outliers <- ezids::outlierKD2(credit_card_no_outliers,Avg_Open_To_Buy, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 963
## Proportion (%) of outliers: 10.5
## Mean of the outliers: 30532
## Mean without removing outliers: 7469
## Mean if we remove outliers: 5046
## Outliers successfully removed
credit_card_no_outliers[["Avg_Open_To_Buy"]][df_outliers$outliers] <- NA
# Outlier Identifiaction and removal from Average Utilization Ratio
df_outliers <- ezids::outlierKD2(credit_card_no_outliers,Avg_Utilization_Ratio, rm = TRUE, boxplt = TRUE, qqplt = TRUE)

## Outliers identified: 0
## Proportion (%) of outliers: 0
## Mean of the outliers: NaN
## Mean without removing outliers: 0.27
## Mean if we remove outliers: 0.27
## Outliers successfully removed
credit_card_no_outliers[["Avg_Utilization_Ratio"]][df_outliers$outliers] <- NA
library(ggplot2)
library(gridExtra)
# Define the pairs of variables
variable_pairs <- list(
c("Customer_Age", "Months_on_book"),
c("Credit_Limit", "Avg_Open_To_Buy"),
c("Total_Trans_Amt", "Total_Trans_Ct")
)
# Create a function to plot histograms and QQ-plots for a variable pair
plot_histogram_qqpair <- function(pair) {
variable1 <- pair[1]
variable2 <- pair[2]
# Extract data for the pair
data_pair <- credit_card_no_outliers[, c(variable1, variable2)]
# Create a histogram for variable 1
hist_var1 <- ggplot(data_pair, aes(x = .data[[variable1]])) +
geom_histogram(binwidth = 10, color = "black", fill = "darkgoldenrod") +
labs(title = paste("Histogram of", variable1)) +
theme_minimal()
# Create a histogram for variable 2
hist_var2 <- ggplot(data_pair, aes(x = .data[[variable2]])) +
geom_histogram(binwidth = 10, color = "black", fill = "tomato") +
labs(title = paste("Histogram of", variable2)) +
theme_minimal()
# Create a QQ-plot
qqplot <- ggplot(data_pair, aes(sample = .data[[variable1]])) +
geom_qq() +
geom_qq_line(color = "darkolivegreen") +
labs(title = paste("QQ-Plot of", variable1, "vs Normal Distribution")) +
theme_minimal()
# Arrange histograms and QQ-plot in one grid
grid.arrange(hist_var1, hist_var2, qqplot, ncol = 2)
}
# Loop through variable pairs and plot histograms and QQ-plots
for (pair in variable_pairs) {
plot_histogram_qqpair(pair)
}



# Calculate the standard deviation for each specified column
std_dev_Customer_Age <- sd(credit_card_no_outliers$Customer_Age)
std_dev_Credit_Limit <- sd(credit_card_no_outliers$Credit_Limit)
std_dev_Months_on_book <- sd(credit_card_no_outliers$Months_on_book)
std_dev_Avg_Open_To_Buy <- sd(credit_card_no_outliers$Avg_Open_To_Buy)
std_dev_Avg_Utilization_Ratio <- sd(credit_card_no_outliers$Avg_Utilization_Ratio)
std_dev_Total_Revolving_Bal <- sd(credit_card_no_outliers$Total_Revolving_Bal)
std_dev_Total_Trans_Amt <- sd(credit_card_no_outliers$Total_Trans_Amt)
std_dev_Total_Trans_Ct <- sd(credit_card_no_outliers$Total_Trans_Ct)
# Create a data frame with column names and standard deviations
std_dev_table <- data.frame(
Column = c("Customer_Age", "Credit_Limit", "Months_on_book", "Avg_Open_To_Buy",
"Avg_Utilization_Ratio", "Total_Revolving_Bal", "Total_Trans_Amt", "Total_Trans_Ct"),
Standard_Deviation = c(std_dev_Customer_Age, std_dev_Credit_Limit, std_dev_Months_on_book, std_dev_Avg_Open_To_Buy,
std_dev_Avg_Utilization_Ratio, std_dev_Total_Revolving_Bal, std_dev_Total_Trans_Amt, std_dev_Total_Trans_Ct)
)
# Load the knitr package
library(knitr)
# Print the table using knitr
print(std_dev_table, align = "c", col.names = c("Column", "Standard Deviation"), caption = "Standard Deviations")
## Column Standard_Deviation
## 1 Customer_Age 8.017
## 2 Credit_Limit 9088.777
## 3 Months_on_book 7.986
## 4 Avg_Open_To_Buy 9090.685
## 5 Avg_Utilization_Ratio 0.276
## 6 Total_Revolving_Bal 814.987
## 7 Total_Trans_Amt 3397.129
## 8 Total_Trans_Ct 23.473
# Perform t-test
t_test_result <- t.test(credit_card_no_outliers$Total_Trans_Amt ~ credit_card_no_outliers$Gender)
# Print the results
print("T-test for Total_Trans_Amt and Gender")
## [1] "T-test for Total_Trans_Amt and Gender"
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: credit_card_no_outliers$Total_Trans_Amt by credit_card_no_outliers$Gender
## t = -2, df = 8914, p-value = 0.01
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -303.9 -34.9
## sample estimates:
## mean in group F mean in group M
## 4324 4494
# Create a contingency table of income category and churn status
table_data <- table(credit_card_no_outliers$Income_Category, credit_card_no_outliers$Income_Category)
# Perform chi-square test
chi_sq_test <- t.test(table_data)
# Print the test results
chi_sq_test
##
## One Sample t-test
##
## data: table_data
## t = 2, df = 35, p-value = 0.03
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 31.2 531.4
## sample estimates:
## mean of x
## 281
# Plot 1
ggplot(credit_card_final, aes(x=Credit_Limit, y=Avg_Utilization_Ratio)) +
geom_point(alpha=0.5, colour='blue', shape=4) +
ggtitle("Example Plot") +
xlab("Credit Limit") +
ylab("Average Utilization Ratio") +
theme_minimal()

# Plot 2
ggplot(credit_card_final[credit_card_final$Income_Category != "Unknown", ],
aes(x = Customer_Age, y = Credit_Limit, color = Income_Category)) +
geom_point(alpha = 0.6) +
ggtitle("Relation Between Credit Limit Vs Age Coloured by Income_Category") +
xlab("Age") +
ylab("Credit Limit") +
scale_colour_viridis_d() +
theme_minimal() +
theme(legend.position = "right")

# Perform t-test for Contacts_Count_12_mon between attrition and non-attrition groups
attrition_group <- credit_card_final$Contacts_Count_12_mon[credit_card_final$Attrition_Flag == "Attrited Customer"]
non_attrition_group <- credit_card_final$Contacts_Count_12_mon[credit_card_final$Attrition_Flag == "Existing Customer"]
t_test_result <- t.test(attrition_group, non_attrition_group)
# Print t-test results
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: attrition_group and non_attrition_group
## t = 21, df = 2280, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.558 0.674
## sample estimates:
## mean of x mean of y
## 2.97 2.36
contingency_table_income <- table(credit_card_no_outliers$Income_Category , credit_card_no_outliers$Attrition_Flag)
ChiS2_test_income <- chisq.test(contingency_table_income)
print(contingency_table_income)
##
## Attrited Customer Existing Customer
## $120K + 126 601
## $40K - $60K 271 1519
## $60K - $80K 189 1213
## $80K - $120K 242 1293
## Less than $40K 612 2949
## Unknown 187 925
print(ChiS2_test_income)
##
## Pearson's Chi-squared test
##
## data: contingency_table_income
## X-squared = 13, df = 5, p-value = 0.03
Attrition_rate <- (contingency_table_income[,"Attrited Customer"]/ (contingency_table_income[,"Attrited Customer"]+contingency_table_income[,"Existing Customer"]))*100
print(Attrition_rate)
## $120K + $40K - $60K $60K - $80K $80K - $120K Less than $40K
## 17.3 15.1 13.5 15.8 17.2
## Unknown
## 16.8
# load libraries
library(dplyr)
library(ggplot2)
# group data
grouped_df <- credit_card_no_outliers %>%
group_by(Income_Category) %>%
summarise(Transaction_mean = mean(Total_Trans_Amt, na.rm = TRUE),
Transaction_median = median(Total_Trans_Amt, na.rm = TRUE),
n = n())
# plotting
ggplot(grouped_df, aes(x=Income_Category)) +
geom_col(aes(y=Transaction_mean, fill="Mean"), width=0.4, position=position_dodge(width=0.5)) +
geom_col(aes(y=Transaction_median, fill="Median"), width=0.4, position=position_dodge(width=0.5)) +
labs(fill='Transaction', y='Transaction Value', title='Transaction Behaviour by Income Category') +
theme_bw() +
theme(axis.text.x = element_text(angle=45, hjust=1))

male_utilization <- credit_card_no_outliers[credit_card_no_outliers$Gender=="F",]$Avg_Utilization_Ratio
female_utilization <- credit_card_no_outliers[credit_card_no_outliers$Gender=="M",]$Avg_Utilization_Ratio
tt_result <- t.test(male_utilization, female_utilization)
print(tt_result)
##
## Welch Two Sample t-test
##
## data: male_utilization and female_utilization
## t = 27, df = 9994, p-value <2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.132 0.153
## sample estimates:
## mean of x mean of y
## 0.342 0.200